Data Retrieval

download.file(url = “https://mothur.s3.us-east-2.amazonaws.com/wiki/miseqsopdata.zip”, destfile = “~/Labs/BIOL4315_Lab5/data/miseqsopdata.zip”) unzip(“~/Labs/BIOL4315_Lab5/data/miseqsopdata.zip”, exdir = “~/Labs/BIOL4315_Lab5/data”) download.file(url = “https://zenodo.org/record/4587955/files/silva_nr99_v138.1_wSpecies_train_set.fa.gz?download=1”, destfile = “~/Labs/BIOL4315_Lab5/data/silva_nr99_v138.1_wSpecies_train_set.fa.gz”)

gunzip silva_nr99_v138.1_wSpecies_train_set.fa.gz

Question 1

plotQualityProfile(fnFs[1:19])

plotQualityProfile(fnRs[1:19])

mkdir filtered

filtFs <- file.path("~/Labs/BIOL4315_Lab5/data/filtered", paste0(sample.names, "_F_filt.fastq.gz"))
names(filtFs) <- sample.names
filtRs <- file.path("~/Labs/BIOL4315_Lab5/data/filtered", paste0(sample.names, "_R_filt.fastq.gz"))
names(filtRs) <- sample.names
out <- filterAndTrim(fnFs, filtFs, fnRs, filtRs, truncLen=c(240,160), maxN=0, maxEE=c(2,2), truncQ = 2 ,rm.phix=TRUE, compress=TRUE, multithread = TRUE)

Question 2

errF <- learnErrors(filtFs, multithread=T)
## 33514080 total bases in 139642 reads from 20 samples will be used for learning the error rates.
errR <- learnErrors(filtRs, multithread=T)
## 22342720 total bases in 139642 reads from 20 samples will be used for learning the error rates.
plotErrors(errF, nominalQ=TRUE)

plotErrors(errR, nominalQ=TRUE)

dadaFs <- dada(filtFs, err=errF, multithread = TRUE)
## Sample 1 - 7113 reads in 1979 unique sequences.
## Sample 2 - 5299 reads in 1639 unique sequences.
## Sample 3 - 5463 reads in 1477 unique sequences.
## Sample 4 - 2914 reads in 904 unique sequences.
## Sample 5 - 2941 reads in 939 unique sequences.
## Sample 6 - 4312 reads in 1267 unique sequences.
## Sample 7 - 6741 reads in 1756 unique sequences.
## Sample 8 - 4560 reads in 1438 unique sequences.
## Sample 9 - 15637 reads in 3590 unique sequences.
## Sample 10 - 11413 reads in 2762 unique sequences.
## Sample 11 - 12017 reads in 3021 unique sequences.
## Sample 12 - 5032 reads in 1566 unique sequences.
## Sample 13 - 18075 reads in 3707 unique sequences.
## Sample 14 - 6250 reads in 1479 unique sequences.
## Sample 15 - 4052 reads in 1195 unique sequences.
## Sample 16 - 7369 reads in 1832 unique sequences.
## Sample 17 - 4765 reads in 1183 unique sequences.
## Sample 18 - 4871 reads in 1382 unique sequences.
## Sample 19 - 6504 reads in 1709 unique sequences.
## Sample 20 - 4314 reads in 897 unique sequences.
dadaRs <- dada(filtRs, err=errF, multithread = TRUE)
## Sample 1 - 7113 reads in 1660 unique sequences.
## Sample 2 - 5299 reads in 1349 unique sequences.
## Sample 3 - 5463 reads in 1335 unique sequences.
## Sample 4 - 2914 reads in 853 unique sequences.
## Sample 5 - 2941 reads in 880 unique sequences.
## Sample 6 - 4312 reads in 1286 unique sequences.
## Sample 7 - 6741 reads in 1803 unique sequences.
## Sample 8 - 4560 reads in 1265 unique sequences.
## Sample 9 - 15637 reads in 3414 unique sequences.
## Sample 10 - 11413 reads in 2522 unique sequences.
## Sample 11 - 12017 reads in 2771 unique sequences.
## Sample 12 - 5032 reads in 1415 unique sequences.
## Sample 13 - 18075 reads in 3290 unique sequences.
## Sample 14 - 6250 reads in 1390 unique sequences.
## Sample 15 - 4052 reads in 1134 unique sequences.
## Sample 16 - 7369 reads in 1635 unique sequences.
## Sample 17 - 4765 reads in 1084 unique sequences.
## Sample 18 - 4871 reads in 1161 unique sequences.
## Sample 19 - 6504 reads in 1502 unique sequences.
## Sample 20 - 4314 reads in 732 unique sequences.

The error frequencies of the forward and reverse reads look similar.

Merge Paired Ends

mergers <- mergePairs(dadaFs, filtFs, dadaRs, filtRs, verbose=TRUE)
## 6566 paired-reads (in 110 unique pairings) successfully merged out of 6909 (in 202 pairings) input.
## 5029 paired-reads (in 102 unique pairings) successfully merged out of 5190 (in 157 pairings) input.
## 4957 paired-reads (in 79 unique pairings) successfully merged out of 5260 (in 174 pairings) input.
## 2586 paired-reads (in 54 unique pairings) successfully merged out of 2756 (in 118 pairings) input.
## 2544 paired-reads (in 61 unique pairings) successfully merged out of 2772 (in 119 pairings) input.
## 3642 paired-reads (in 56 unique pairings) successfully merged out of 4098 (in 158 pairings) input.
## 6068 paired-reads (in 83 unique pairings) successfully merged out of 6510 (in 202 pairings) input.
## 3965 paired-reads (in 90 unique pairings) successfully merged out of 4382 (in 188 pairings) input.
## 14038 paired-reads (in 144 unique pairings) successfully merged out of 15336 (in 350 pairings) input.
## 10381 paired-reads (in 123 unique pairings) successfully merged out of 11087 (in 271 pairings) input.
## 11027 paired-reads (in 137 unique pairings) successfully merged out of 11642 (in 302 pairings) input.
## 4385 paired-reads (in 88 unique pairings) successfully merged out of 4801 (in 180 pairings) input.
## 17309 paired-reads (in 154 unique pairings) successfully merged out of 17811 (in 275 pairings) input.
## 5849 paired-reads (in 83 unique pairings) successfully merged out of 6092 (in 162 pairings) input.
## 3707 paired-reads (in 86 unique pairings) successfully merged out of 3876 (in 149 pairings) input.
## 6856 paired-reads (in 99 unique pairings) successfully merged out of 7183 (in 188 pairings) input.
## 4428 paired-reads (in 68 unique pairings) successfully merged out of 4609 (in 127 pairings) input.
## 4556 paired-reads (in 100 unique pairings) successfully merged out of 4718 (in 171 pairings) input.
## 6093 paired-reads (in 109 unique pairings) successfully merged out of 6315 (in 174 pairings) input.
## 4269 paired-reads (in 20 unique pairings) successfully merged out of 4281 (in 28 pairings) input.

Frequency Table

seqtab <- makeSequenceTable(mergers)
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread=F, verbose=TRUE)
## Identified 63 bimeras out of 298 input sequences.

Sanity Check

getN <- function(x){ 
  sum(getUniques(x))
}
track <- cbind(out, sapply(dadaFs, getN), sapply(dadaRs, getN), sapply(mergers, getN), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoisedF", "denoisedR", "merged", "nonchim")
rownames(track) <- sample.names
track <- as.data.frame(track) %>% mutate(prop_kept = round(nonchim/input*100,2))
DT::datatable(track)

Polishing Frequency Table

ASV_tbl <- as.data.frame(seqtab.nochim) %>% `colnames<-`(base::paste("ASV", seq(1:ncol(seqtab.nochim)), sep = ""))
ASV_tbl <- as.data.frame(t(ASV_tbl[-nrow(ASV_tbl),]))
ASV_tbl <- ASV_tbl %>% mutate(rs = rowSums(ASV_tbl)) %>% filter(rs > 0) %>% dplyr::select(-rs)
DT::datatable(ASV_tbl, options = list(scrollX = TRUE, autoWidth = TRUE))
ASV_seqs <- DNAStringSet(base::colnames(seqtab.nochim) %>% `names<-`(base::paste("ASV",seq(1:ncol(seqtab.nochim)),sep = "")))

Assign Taxonomy

taxa <- assignTaxonomy(seqtab.nochim, "~/Labs/BIOL4315_Lab5/data/silva_nr99_v138.1_wSpecies_train_set.fa.gz", multithread=TRUE)
taxa2 <- as.data.frame(taxa) %>% `row.names<-`(base::paste("ASV", seq(1:nrow(taxa)), sep = ""))
DT::datatable(taxa2,options = list(scrollX = TRUE, autoWidth = TRUE))

Polishing Taxonomy Table

taxa2 <- taxa2 %>% filter(Family != "Mitochondria") %>% filter(Order != "Chloroplast")
taxa3 <- taxa2 %>% mutate(ASVs = row.names(taxa2)) %>% dplyr::select(ASVs, everything())
ASV_tbl2 <- ASV_tbl %>% mutate(ASVs=row.names(ASV_tbl)) %>% dplyr::select(ASVs, everything())
taxa3 <- taxa3 %>% inner_join(ASV_tbl2, by = "ASVs")
taxa2 <- taxa2 %>% filter(rownames(taxa2)%in%taxa3$ASVs)
ASV_tbl <- ASV_tbl %>% filter(row.names(ASV_tbl) %in% taxa3$ASVs)
taxa3[,(ncol(taxa2)+2):ncol(taxa3)] <- sweep(taxa3[,(ncol(taxa2)+2):ncol(taxa3)],2, colSums(taxa3[,(ncol(taxa2)+2):ncol(taxa3)]),'/')*100

Exporting Microbiome

ASV_seqs[names(ASV_seqs)%in%ASV_tbl2$ASVs]
## DNAStringSet object of length 221:
##       width seq                                             names               
##   [1]   252 TACGGAGGATGCGAGCGTTATC...GAAAGTGTGGGTATCGAACAGG ASV1
##   [2]   252 TACGGAGGATGCGAGCGTTATC...GAAAGCGTGGGTATCGAACAGG ASV2
##   [3]   252 TACGGAGGATGCGAGCGTTATC...GAAAGCGTGGGTATCGAACAGG ASV3
##   [4]   252 TACGGAGGATGCGAGCGTTATC...GAAAGTGCGGGGATCGAACAGG ASV4
##   [5]   253 TACGGAGGATCCGAGCGTTATC...GAAAGTGTGGGTATCAAACAGG ASV5
##   ...   ... ...
## [217]   253 TACGTAGGGAGCGAGCGTTATC...GAAAGTGTGGGGAGCAAACAGG ASV231
## [218]   252 TACGTAGGGAGCGAGCGTTATC...GAAAGCTGGGGGAGCGAACAGG ASV232
## [219]   253 TACGGAGGATGCGAGCGTTATC...GAAAGCGTGGGGAGCAAACAGG ASV233
## [220]   252 TACGGAGGGGGCGAGCGTTATC...GAAAGCTAGGGGAGCGAACAGG ASV234
## [221]   252 TACGTAGGGGGCGAGCGTTATC...GAAAGCGTGGGGAGCAAATAGG ASV235
writeXStringSet(ASV_seqs[names(ASV_seqs)%in%ASV_tbl2$ASVs], "~/Labs/BIOL4315_Lab5/output/microbiome.fasta")

Question 3

ASV184 Firmicutes and ASV207 Actinobacteriota could not be resolved down to genus/species.

ASV184

TACGTAGGGAGCAAGCGTTGTCCGGAATTACTGGGTGTAAAGGGAGCGTAGGCGGGAGGATAAGTTGAATGTGAAATCTA TGGGCTCAACCCATAGCTGCGTTCAAAACTGTTCTTCTTGAGTGAAGTAGAGGCAGGCGGAATTCCTAGTGTAGCGGTGA AATGCGTAGATATTAGGAGGAACACCAGTGGCGAAGGCGGCCTGCTGGGCTTTTACTGACGCTGAGGCTCGAAAGCGTGG GTAGCAAACAGG

ASV207

TACGTAGGGGGCGAGCGTTATCCGGATTCATTGGGCGTAAAGCGCGCGCAGGCGGACTCATAAGCGGAGCCTTTAATCTT GGGGCTTAACCTCAAGTCGGGCCCCGAACTGTGAGTCTCGAGTGTGGTAGGGGAAGGCGGAATTCCCGGTGTAGCGGTGG AATGCGCAGATATCGGGAAGAACACCGATGGCGAAGGCAGCCTTCTGGGCCATCACTGACGCTGAGGCGCGAAAGCTAGG GGAGCAAACAGG

Using a BLAST search of the sequences the following results were obtained:

The top hits for ASV184 and ASV207 were M. coli and P. caecicola respectively.

Phyloseq

mdta <- read.delim(base::paste(data_dir,"mouse.dpw.metadata", sep = "/" ))
mdta <- mdta %>% mutate(category = if_else(dpw > 100 , "Late","Early")) %>% mutate(gender = if_else(str_detect(group, "F"),"F","M")) %>% mutate(subject = str_extract(str_split_fixed(group, "D", 2)[,1],"(\\d)+")) %>% `row.names<-`(mdta$group)
ps <- phyloseq(otu_table(as.matrix(ASV_tbl), taxa_are_rows=T), sample_data(mdta), tax_table(as.matrix(taxa2)))
ps <- merge_phyloseq(ps, ASV_seqs)
ps
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 198 taxa and 19 samples ]
## sample_data() Sample Data:       [ 19 samples by 5 sample variables ]
## tax_table()   Taxonomy Table:    [ 198 taxa by 7 taxonomic ranks ]
## refseq()      DNAStringSet:      [ 198 reference sequences ]

Alpha Diversity

Question 4 & 5

rnd <- estimate_richness( ps, measures=c("Observed", "ACE","Shannon", "Simpson")) %>% dplyr::select(-se.ACE)
DT::datatable(rnd[1:2])
df = as.data.frame(rnd[1:2])
write.csv(df, "~/Labs/BIOL4315_Lab5/output/alpha_diversity.csv")
rnd <- rnd %>% mutate(group = row.names(rnd))%>% inner_join(mdta, "group")
rndl <- rnd %>% gather("Estimator", "Value", 1:4) %>% mutate(rd = if_else(Estimator == "Observed" | Estimator == "ACE", "Richness","Diversity"))
library("RColorBrewer")
p <- ggplot(rndl, aes(x=dpw, y=Value, color = Estimator, fill = Estimator))+
  geom_bar(stat = "identity", position = position_dodge())+
  facet_grid(rd~category, scales = "free")
p + theme_bw() + scale_fill_brewer(palette = "Dark2") + scale_color_brewer(palette = "Greys")